Model LUAD stage

Last modified 02:27 PM on Jan 15, 2016. This document, R session image, version history, knitr cache, figures, and other associated datasets are located in /inside/grotto/blin/trna-markers/luad/predict/.

library(plyr)
library(glmnet)
library(ROCR)
library(GenomicRanges)
library(stringr)
set.seed(12)
load('/inside/home/blin/grotto/data/hg19-srnas.RData')
load('/inside/home/blin/grotto/trna-markers/process-reads/luad-counts.RData')
luad_clinical <- luad_clinical[match(colnames(luad_adjusted_counts), luad_clinical$barcode), ]
ntrials <- 150

Generic function for classification of samples into discrete classes

setupTrainingTestingSets <- function(metadata, covariate, counts) {
  # metadata: filtered metadata for covariate of interest (e.g. no [Not Applicable] entries)
  # covariate: name of covariate (must be a column in metadata)
  # counts must be filtered for covariate of interest
  random_sample <- unlist(sapply(levels(metadata[, covariate]), function(class) {
    num_samples <- round(sum(metadata[, covariate] == class)/2)
    sample(metadata[metadata[, covariate] == class, ]$participant_id, num_samples)
  }))
  training_counts <-  counts[, metadata$participant_id %in% random_sample]
  training_metadata <- metadata[metadata$participant_id %in% random_sample, ]
  testing_counts <- counts[, !(metadata$participant_id %in% random_sample)]
  testing_metadata <- metadata[!(metadata$participant_id %in% random_sample), ]
  list(training_counts = training_counts, training_metadata = training_metadata, testing_counts = testing_counts, testing_metadata = testing_metadata, participant_ids = random_sample)
}

buildTestGlm <- function(dataset, features, covariate, randomize = FALSE) {
  training_counts <- t(dataset$training_counts[rownames(dataset$training_counts) %in% features, ])
  training_response <- dataset$training_metadata[match(colnames(dataset$training_counts), dataset$training_metadata$barcode), ][, covariate]
  testing_counts <- t(dataset$testing_counts[rownames(dataset$testing_counts) %in% features, ])
  testing_response <- dataset$testing_metadata[match(colnames(dataset$testing_counts), dataset$testing_metadata$barcode), ][, covariate]
  model <- glmnet(training_counts, training_response, family = "binomial")
  chosen_model <- which.min(abs(model$df - 20)) # select for ~ 20 features)
  pred <- predict(model, newx = testing_counts, s = model$lambda[chosen_model])
  prob <- prediction(pred, testing_response)
  if (randomize) prob <- prediction(pred, sample(testing_response))
  perf <- performance(prob, "tpr", "fpr")
  roc <- data.frame(TPR = unlist(perf@y.values), FPR = unlist(perf@x.values))
  coef <- model$beta[, chosen_model]
  coef <- coef[coef != 0]
  auc <- unlist(performance(prob, "auc")@y.values)
  auc <- round(auc, 3)
  list(roc = roc, coef = coef, auc = auc)
}

parseGlms <- function(glms, class) ldply(1:length(glms), function(i) data.frame(glms[[i]]$roc, Trial = i, AUC = glms[[i]]$auc, Class = class))

plotGlms <- function(roc_df) ggplot(roc_df) + geom_line(aes(x = FPR, y = TPR, color = Class, group = paste0(Trial, Class)), alpha = 0.3) + geom_abline(intercept = 0, slope = 1, colour = "gray") + ylab("TPR") + xlab("FPR") + facet_wrap(~ Class, nrow = 2) + guides(color = FALSE)

plotMeanAucs <- function(all_feature_glms) {
  mean_aucs <- lapply(all_feature_glms, function(glms) {
    aucs <- lapply(1:length(glms), function(i) glms[[i]]$auc)
    mean(unlist(aucs))
  })
  df <- data.frame(AUC = unlist(mean_aucs), Class = as.factor(c("tsRNA", "miRNA", "snoRNA", "piRNA", "tRNA half", "tRNA")))
  ggplot(df) + geom_bar(aes(x = Class, y = AUC), stat = "identity") + ggtitle("Mean AUC by sRNA type")
}
buildTestCompareGlms <- function(metadata, covariate, counts, ntrials) {
  # metadata: filtered metadata for covariate of interest
  # counts: filtered counts containing only sample with covariate of interest

  # set up feature sets
  load('/inside/home/blin/grotto/data/hg19-srnas.RData')
  tsrnas <- unique(srnas[srnas$class %in% c("trf-1", "trf-3", "trf-5", "actRF-3", "actRF-5", "trailer")]$tx_name)
  mirnas <- unique(srnas[str_detect(srnas$class, "mi")]$tx_name)
  snornas <- unique(srnas[srnas$class == "snoRNA"]$tx_name)
  pirnas <- unique(srnas[srnas$class == "piRNA"]$tx_name)
  halves <- unique(srnas[srnas$class %in% c("fivehalf", "threehalf")]$tx_name)
  trnas <- unique(srnas[srnas$class == "tRNA"]$tx_name)

  # create training/testing sets and feature sets
  system(paste0('echo Setting up training/testing sets'))
  datasets <- lapply(1:ntrials, function(i) setupTrainingTestingSets(metadata, covariate, counts))


  # build, test on scrambled data (control)
  system(paste0('echo Building/testing models - control'))
  control_tsrna_glms <- lapply(datasets, buildTestGlm, features = tsrnas, covariate = covariate, randomize = TRUE) # the same models are built twice, but the code is a lot cleaner this way
  control_mirna_glms <- lapply(datasets, buildTestGlm, features = mirnas, covariate = covariate, randomize = TRUE)
  control_feature_glms <- list(control_tsrna_glms, control_mirna_glms)
  control_roc <- ldply(mapply(parseGlms, glms = control_feature_glms, class = c("tsRNA, permuted", "miRNA, permuted"), SIMPLIFY = FALSE), identity)

  # build and predict testing data
  system(paste0('echo Building/testing models'))
  all_feature_glms <- lapply(list(tsrnas, mirnas, snornas, pirnas, halves, trnas), function(features) {
    lapply(datasets, buildTestGlm, features = features, covariate = covariate, randomize = FALSE)
    })
  roc <- ldply(mapply(parseGlms, glms = all_feature_glms, class = c("tsRNA", "miRNA", "snoRNA", "piRNA", "tRNA half", "tRNA"), SIMPLIFY = FALSE), identity)
  roc <- rbind(roc, control_roc)
  roc$Class <- factor(roc$Class, levels = c("tsRNA", "miRNA", "snoRNA", "tsRNA, permuted", "piRNA", "tRNA half", "tRNA", "miRNA, permuted"))

  # create plots
  system(paste0('echo Creating summary plot'))
  plot <- plotGlms(roc)

  # create summary object
  summary <- list()
  summary$glms <- all_feature_glms
  summary$control_glms <- control_feature_glms
  summary$roc <- roc
  summary$plot <- plot
  summary$samples <- lapply(datasets, function(dataset) unname(dataset$participant_ids))
  summary
}
metadata <- luad_clinical
metadata$sample_type <- as.factor(metadata$sample_type)
sample_type <- buildTestCompareGlms(metadata = metadata, covariate = "sample_type", counts = luad_adjusted_counts, ntrials = ntrials)
sample_type$plot

plot of chunk sample-type-plot

plotMeanAucs(sample_type$glms)

plot of chunk sample-type-auc-plot

metadata <- luad_clinical[luad_clinical$m_stage %in% c("M0", "M1", "M1a", "M1b"), ]
counts <- luad_adjusted_counts[, colnames(luad_adjusted_counts) %in% metadata$barcode]
metadata$metastasis <- as.factor(ifelse(metadata$m_stage == "M0", "No metastasis" , "Metastasis"))
metastasis <- buildTestCompareGlms(metadata = metadata, covariate = "metastasis", counts = counts, ntrials = ntrials)
metastasis$plot

plot of chunk metastasis-plot

plotMeanAucs(metastasis$glms)

plot of chunk metastasis-auc-plot

# stage I, IA vs stage IIIA, IIIB, IV - essentially spreading vs non-spreading
metadata <- luad_clinical[luad_clinical$stage %in% c("Stage I", "Stage IA", "Stage IIIA", "Stage IIIB", "Stage IV"), ]
counts <- luad_adjusted_counts[, colnames(luad_adjusted_counts) %in% metadata$barcode]
metadata$stage <- as.factor(ifelse(metadata$stage %in% c("Stage I", "Stage IA"), "Early stage" , "Late stage"))
stage_coarse <- buildTestCompareGlms(metadata = metadata, covariate = "stage", counts = counts, ntrials = ntrials)
stage_coarse$plot

plot of chunk stage-coarse-plot

plotMeanAucs(stage_coarse$glms)

plot of chunk stage-coarse-auc-plot

# stage I, IA, vs stage II, IIA, IIB, IIIA, IIIB - early progression
metadata <- luad_clinical[luad_clinical$stage != "[Not Available]", ]
counts <- luad_adjusted_counts[, colnames(luad_adjusted_counts) %in% metadata$barcode]
metadata$stage <- as.factor(ifelse(metadata$stage %in% c("Stage I", "Stage IA", "Stage IB"), "Not progressed" , "Progressed"))
early_progression <- buildTestCompareGlms(metadata = metadata, covariate = "stage", counts = counts, ntrials = ntrials)
early_progression$plot

plot of chunk early-progression-plot

plotMeanAucs(early_progression$glms)

plot of chunk early-progression-auc-plot

save.session("predict.RSession")
## Saving search path..
## Saving list of loaded packages..
## Saving all data...
## Done.